
from docx import Document
from pathlib import Path
import re
from hashlib import md5
import json
from collections import Counter


def hash(name):
    """Return the hexadecimal MD5 digest of *name*.

    The string is encoded as UTF-8 before hashing.

    NOTE: this function shadows the built-in ``hash`` inside this module.
    """
    encoded = name.encode('utf-8')
    return md5(encoded).hexdigest()


def _leading_bold_text(para):
    """Return the concatenated text of the bold runs that open *para*.

    Runs are scanned in order and scanning stops at the first run whose
    ``bold`` attribute is falsy, so bold text appearing later in the
    paragraph is not included.
    """
    bold_texts = []
    for run in para.runs:
        if not run.bold:
            break
        bold_texts.append(run.text)
    return ''.join(bold_texts)


def main(doc_dir=r'D:\myworks\合同审查项目\合同数据', target_index=3):
    """Print the leading bold text of every paragraph of one document.

    The directory tree under *doc_dir* is walked recursively; paths that
    contain ``~$`` are skipped and the remaining paths are counted from 1.
    Only the path whose count equals *target_index* is opened. For each of
    its paragraphs a line ``<leading bold text>  ---  <paragraph text>`` is
    printed.

    Args:
        doc_dir: Root directory to search. Defaults to the original
            hard-coded data directory.
        target_index: 1-based position (among the non-skipped paths, in
            ``rglob`` order) of the document to inspect. Defaults to 3.

    Returns:
        None. If fewer than *target_index* paths are found, nothing is
        printed.
    """
    count = 0
    # NOTE(review): '*.*' matches any name containing a dot, not only .docx
    # files; Document() may fail on other file types — confirm intent.
    for file in Path(doc_dir).rglob('*.*'):
        path_text = file.as_posix()
        # Presumably filters out Office lock/temp files ("~$name.docx");
        # the check is applied to the whole path, not just the file name.
        if '~$' in path_text:
            continue
        count += 1
        if count != target_index:
            continue
        print(path_text)
        doc = Document(file)
        for para in doc.paragraphs:
            print(_leading_bold_text(para), ' --- ', para.text)
        # The counter can never equal target_index again, so there is no
        # point in walking the rest of the tree.
        break


# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()